In [3]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import os
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics import tsaplots
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from prophet import Prophet
from prophet.plot import plot_components
In [76]:
# Step 1: Load the CSV as strings
df = pd.read_csv('mixed_dataset.csv', dtype=str)

# Step 2: Rename columns
df.rename(columns={
    'dt': 'Date',
    'LandAverageTemperature': 'LandAvgTemp'
}, inplace=True)

# Step 3: Split based on known format
# Format A: YYYY-MM-DD (before 1900)
df_iso = df[df['Date'].str.contains(r'\d{4}-\d{2}-\d{2}', na=False)].copy()
df_iso['Date'] = pd.to_datetime(df_iso['Date'], format='%Y-%m-%d', errors='coerce')

# Format B: DD/MM/YYYY (after 1900)
df_dmy = df[~df.index.isin(df_iso.index)].copy()
df_dmy['Date'] = pd.to_datetime(df_dmy['Date'], format='%d/%m/%Y', errors='coerce')

# Step 4: Combine both parsed subsets
df = pd.concat([df_iso, df_dmy])

# Step 5: Drop NaTs, filter, and clean
df.dropna(subset=['Date'], inplace=True)
df = df[df['Date'] >= '1900-01-01']
df['LandAvgTemp'] = pd.to_numeric(df['LandAvgTemp'], errors='coerce')
df.dropna(subset=['LandAvgTemp'], inplace=True)
df.set_index('Date', inplace=True)
df.sort_index(inplace=True)
In [77]:
# 9. Plot the Entire Time Series
plt.figure(figsize=(12, 5))
plt.plot(df.index, df['LandAvgTemp'], label='Global Land Avg Temp (°C)', alpha=0.99, linewidth=0.8)
plt.title('Global Land Average Temperature (Jan 1900 to Dec 2015)')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid(True)

plt.show()
No description has been provided for this image
In [5]:
data = df['LandAvgTemp'].copy()
In [6]:
# Calculate the rolling average to estimate the trend (12 months = 1 year)
trend = data.rolling(window=12, center=True).mean()

# Plot the trend
plt.figure(figsize=(10, 4))
plt.plot(data, label='Original', alpha = 0.4, )
plt.plot(trend, label='Trend (12-mo Rolling Mean)', color='red')
plt.title('Trend Estimation')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [7]:
# Resample the temperature data to yearly frequency and compute the mean
yearly_data = data.resample('Y').mean()

# Plot the yearly average trend
plt.figure(figsize=(10, 4))
plt.plot(yearly_data, label='Yearly Avg Temp (Trend)', color='orange')
plt.title('Global Land Avg Temperature - Yearly Average Trend')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid(True)
plt.show()
<ipython-input-7-b109af4205ee>:2: FutureWarning: 'Y' is deprecated and will be removed in a future version, please use 'YE' instead.
  yearly_data = data.resample('Y').mean()
No description has been provided for this image
In [8]:
# Step 1: Compute 12-month rolling average (the trend)
rolling_trend = df['LandAvgTemp'].rolling(window=12, center=True).mean()

# Step 2: Align data by dropping NaNs introduced by rolling
valid_index = rolling_trend.dropna().index
aligned_data = df['LandAvgTemp'].loc[valid_index]
aligned_trend = rolling_trend.loc[valid_index]

# Step 3: Subtract trend from data
detrended = aligned_data - aligned_trend

# Step 4: Plot detrended series
plt.figure(figsize=(12, 5))
plt.plot(detrended.index, detrended, label='Detrended Series', color='orange')
plt.title('Detrended Temperature Series (Original - 12-Month Rolling Trend)')
plt.xlabel('Year')
plt.ylabel('Detrended Temperature (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
# Step 1: Extract the month (1–12) for each detrended value
months = pd.Series(detrended.index.month, index=detrended.index)

# Step 2: Group detrended values by month and compute the mean for each month
monthly_seasonal_values = detrended.groupby(months).mean()

# Step 3: Map the monthly seasonality values back to each data point
seasonality = months.map(monthly_seasonal_values)

# Step 4: Plot the seasonality
plt.figure(figsize=(12, 5))
plt.plot(detrended.index, seasonality, label='Estimated Seasonality', color='green')
plt.title('Seasonality Component (Manual Estimation)')
plt.xlabel('Year')
plt.ylabel('Seasonal Effect (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

print("\nMonthly Seasonal Averages:")
print(monthly_seasonal_values.round(3))
No description has been provided for this image
Monthly Seasonal Averages:
Date
1    -5.932
2    -5.427
3    -3.366
4    -0.222
5     2.633
6     4.779
7     5.687
8     5.197
9     3.403
10    0.759
11   -2.545
12   -4.961
Name: LandAvgTemp, dtype: float64
In [10]:
# Filter the period 1950–2000 for better visualization
zoom_range = seasonality[(seasonality.index.year >= 1955) & (seasonality.index.year <= 1980)]

plt.figure(figsize=(14, 5))
plt.plot(zoom_range.index, zoom_range.values, label='Estimated Seasonality (1955–1980)', color='green')
plt.title('Seasonality Component (Zoomed In: 1955–1980)')
plt.xlabel('Year')
plt.ylabel('Seasonal Effect (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [11]:
# Residual = Original - Trend - Seasonality
residual = aligned_data - aligned_trend - seasonality

# Plot residuals
plt.figure(figsize=(10, 4))
plt.plot(residual, label='Residuals (Noise)', color='red')
plt.title('Residual Component (Original - Trend - Seasonality)')
plt.xlabel('Year')
plt.ylabel('Residual (°C)')
plt.legend()
plt.grid(True)
plt.show()
No description has been provided for this image
In [12]:
from statsmodels.tsa.stattools import adfuller

# Run the ADF test
adf_result = adfuller(residual.dropna())

# Print results
print("ADF Statistic:", adf_result[0])
print("p-value:", adf_result[1])
print("Critical Values:")
for key, value in adf_result[4].items():
    print(f"   {key}: {value}")
ADF Statistic: -15.658691940308238
p-value: 1.5637460312374347e-28
Critical Values:
   1%: -3.4351745242248715
   5%: -2.8636706623947417
   10%: -2.567904365598721

part 2:¶

In [13]:
# 12-month seasonal differencing to remove yearly seasonality
data_diff = df['LandAvgTemp'].diff(periods=12)
data_diff = data_diff.dropna()

# Plot the differenced series
plt.figure(figsize=(10, 5))
plt.plot(data_diff, label='12-Month Differenced Series', color='purple')
plt.title("Plot of Differenced Series (Lag=12)", fontsize=15)
plt.xlabel('Year')
plt.ylabel('Differenced Temp (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [14]:
# ACF plot (lags=25)
plt.figure(figsize=(2, 2))
sm.graphics.tsa.plot_acf(data_diff, lags=25)
plt.title('ACF of Seasonally Differenced Series')
plt.tight_layout()
plt.show()
<Figure size 200x200 with 0 Axes>
No description has been provided for this image
In [15]:
# PACF plot (lags=25)
plt.figure(figsize=(2, 2))
sm.graphics.tsa.plot_pacf(data_diff, lags=25, method='ywm')
plt.title('PACF of Seasonally Differenced Series')
plt.tight_layout()
plt.show()
plt.show()
<Figure size 200x200 with 0 Axes>
No description has been provided for this image
In [18]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Define the list of candidate models: (p, d, q, P, D, Q, s)
candidates = [
    (1, 1, 1, 1, 1, 1, 12),
    (2, 1, 2, 1, 1, 1, 12),
    (2, 1, 1, 2, 1, 1, 12),
    (1, 1, 2, 1, 1, 2, 12),
    (2, 1, 0, 2, 1, 1, 12)
]

best_bic = np.inf
best_model = None
best_order = None

print("BIC for each SARIMA candidate:\n")

for order in candidates:
    p, d, q, P, D, Q, s = order
    try:
        model = SARIMAX(df['LandAvgTemp'],
                        order=(p, d, q),
                        seasonal_order=(P, D, Q, s),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        result = model.fit(disp=False)
        bic = result.bic
        print(f"SARIMA({p},{d},{q})({P},{D},{Q})[{s}] --> BIC: {bic:.2f}")

        if bic < best_bic:
            best_bic = bic
            best_model = result
            best_order = order

    except Exception as e:
        print(f"Model SARIMA({p},{d},{q})({P},{D},{Q})[{s}] failed: {e}")

# Final chosen model
if best_model:
    print("\n Best model based on BIC:")
    p, d, q, P, D, Q, s = best_order
    print(f"Best SARIMA({p},{d},{q})({P},{D},{Q})[{s}] with BIC: {best_bic:.2f}")

# Optional: Plot diagnostics for the best model
best_model.plot_diagnostics(lags=20, figsize=(15, 12))
plt.show()
BIC for each SARIMA candidate:

/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
SARIMA(1,1,1)(1,1,1)[12] --> BIC: 611.79
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
SARIMA(2,1,2)(1,1,1)[12] --> BIC: 590.28
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
SARIMA(2,1,1)(2,1,1)[12] --> BIC: 597.79
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
SARIMA(1,1,2)(1,1,2)[12] --> BIC: 580.44
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
SARIMA(2,1,0)(2,1,1)[12] --> BIC: 776.36

 Best model based on BIC:
Best SARIMA(1,1,2)(1,1,2)[12] with BIC: 580.44
No description has been provided for this image
In [19]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Define the model
model = SARIMAX(df['LandAvgTemp'],
                order=(0, 1, 2),
                seasonal_order=(0, 1, 1, 12),
                enforce_stationarity=False,
                enforce_invertibility=False)

# Fit the model
result = model.fit(disp=False)

# Print BIC
print(f"BIC for SARIMA(0,1,2)(0,1,1)[12]: {result.bic:.2f}")
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
BIC for SARIMA(0,1,2)(0,1,1)[12]: 620.40
In [20]:
# Make sure best_model_name is defined (if not carried over from earlier cell)
best_model_name = "SARIMA(1,1,2)(1,1,2)[12]"  # or whatever string you want for display

# Step 1: Define forecast horizon
FORECAST_STEPS = 48  # Forecast 4 years

# Step 2: Generate forecast from the best model
forecast_result = best_model.get_forecast(steps=FORECAST_STEPS)
forecast_mean = forecast_result.predicted_mean
forecast_ci = forecast_result.conf_int()

# Step 3: Create future datetime index
last_date = df.index[-1]
future_index = pd.date_range(start=last_date + pd.DateOffset(months=1),
                             periods=FORECAST_STEPS, freq='MS')

# Step 4: Create forecast DataFrame
forecast_df = pd.DataFrame(forecast_mean.values, index=future_index, columns=['Forecast'])

# Step 5: Plot the observed series with forecast
plt.figure(figsize=(20, 10))
start_zoom = '1995-01-01'
plt.plot(df['LandAvgTemp'].loc[start_zoom:], label='Observed', linewidth=2)
plt.plot(forecast_df, label='Forecast (Next 4 Years)', color='orange', linewidth=2)

# Improved confidence interval shading
plt.fill_between(forecast_ci.index,
                 forecast_ci.iloc[:, 0],
                 forecast_ci.iloc[:, 1],
                 color='dimgray', alpha=0.6, label='95% Confidence Interval')

# Labels and title
plt.xlabel('Date', fontsize=20)
plt.ylabel('Temperature (°C)', fontsize=20)
plt.title(f"SARIMA Forecast using {best_model_name}", fontsize=20)

# Improved legend styling
plt.legend(loc='upper left', fontsize=12, frameon=True, fancybox=True, framealpha=1, borderpad=1)

plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use the cleaned monthly data (with equal spacing)
y = df['LandAvgTemp'].copy()
y = y.dropna()
y = y.values
n = len(y)
T = 12  # Monthly seasonality
K = T // 2  # Number of harmonics = 6

# Time index
t = np.arange(1, n + 1)

# Estimate alpha_0
alpha_0 = np.mean(y)

# Initialize arrays
alpha_hat = []
beta_hat = []

for k in range(1, K + 1):
    lam_k = 2 * np.pi * k / T
    alpha_k = (2 / n) * np.sum(y * np.cos(lam_k * t))
    beta_k = (2 / n) * np.sum(y * np.sin(lam_k * t))
    alpha_hat.append(alpha_k)
    beta_hat.append(beta_k)

# Show estimated coefficients
print(f"α₀ = {alpha_0:.4f}")
for k in range(1, K + 1):
    print(f"α̂_{k} = {alpha_hat[k-1]:.4f}, β̂_{k} = {beta_hat[k-1]:.4f}")
α₀ = 8.7625
α̂_1 = -4.8502, β̂_1 = -3.3440
α̂_2 = -0.0743, β̂_2 = -0.1629
α̂_3 = -0.0151, β̂_3 = 0.0514
α̂_4 = -0.0339, β̂_4 = 0.0607
α̂_5 = 0.0012, β̂_5 = 0.0099
α̂_6 = 0.0434, β̂_6 = -0.0000
In [22]:
print("alpha_hat:", alpha_hat)
print("beta_hat:", beta_hat)
print("K =", len(alpha_hat))
alpha_hat: [np.float64(-4.850170476452435), np.float64(-0.07431824712645876), np.float64(-0.015107758620683429), np.float64(-0.033861350574697524), np.float64(0.0012092695557568498), np.float64(0.04341666666666653)]
beta_hat: [np.float64(-3.343972751086735), np.float64(-0.162893654720736), np.float64(0.051449712643674234), np.float64(0.06073252001914624), np.float64(0.009892291316795332), np.float64(-2.0070870531230247e-14)]
K = 6
In [23]:
# Compute and print I_k values
I_k = [(n / 2) * (a**2 + b**2) for a, b in zip(alpha_hat, beta_hat)]

print("I_k values (Fourier Periodogram):")
for k, ik in enumerate(I_k, 1):
    period = int(T / k)
    print(f"Harmonic k={k} (Period = {period}): I_k = {ik:.2f}")

# Dot-style scatter plot (like the slide)
plt.figure(figsize=(10, 5))
plt.scatter(range(1, K+1), I_k, color='red', s=80)
plt.xticks(range(1, K+1), [f'λ{k}\n(p={int(T/k)})' for k in range(1, K+1)])
plt.ylabel('$I_k$', fontsize=12)
plt.xlabel('Harmonic (k)', fontsize=12)
plt.title('Discrete Periodogram (Fourier Coefficients)', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.show()
I_k values (Fourier Periodogram):
Harmonic k=1 (Period = 12): I_k = 24155.59
Harmonic k=2 (Period = 6): I_k = 22.31
Harmonic k=3 (Period = 4): I_k = 2.00
Harmonic k=4 (Period = 3): I_k = 3.37
Harmonic k=5 (Period = 2): I_k = 0.07
Harmonic k=6 (Period = 2): I_k = 1.31
No description has been provided for this image
In [24]:
from scipy.stats import f

# Step 1: Reconstruct fitted values using the Fourier series
lam = [2 * np.pi * k / T for k in range(1, K+1)]
y_hat = np.full(n, alpha_0)

for k in range(K):
    y_hat += alpha_hat[k] * np.cos(lam[k] * t) + beta_hat[k] * np.sin(lam[k] * t)

# Step 2: Residual sum of squares
residuals = y - y_hat
SSRes = np.sum(residuals**2)

# Step 3: Explained sum of squares (from I_k)
SSReg = np.sum(I_k)

# Step 4: Global F-test (Test 1: Are there any cycles?)
F_global = (SSReg / (T - 1)) / (SSRes / (n - T))
F_crit_global = f.ppf(0.95, T - 1, n - T)

print(f"Global F-statistic: {F_global:.2f}")
print(f"Critical value (95%): {F_crit_global:.2f}")
print("Reject H0 (no cycles)?", F_global > F_crit_global)

# Step 5: Individual tests (Test 2: Is cycle k significant?)
F_k = [(ik / 2) / (SSRes / (n - T)) for ik in I_k]
F_crit_individual = f.ppf(0.95, 2, n - T)

print("\nIndividual F-tests for each harmonic:")
for k in range(1, K+1):
    reject = F_k[k-1] > F_crit_individual
    print(f"Cycle k={k} (period {int(T/k)}): F={F_k[k-1]:.2f} → Reject H0? {reject}")
Global F-statistic: 8585.02
Critical value (95%): 1.80
Reject H0 (no cycles)? True

Individual F-tests for each harmonic:
Cycle k=1 (period 12): F=47160.88 → Reject H0? True
Cycle k=2 (period 6): F=43.56 → Reject H0? True
Cycle k=3 (period 4): F=3.91 → Reject H0? True
Cycle k=4 (period 3): F=6.57 → Reject H0? True
Cycle k=5 (period 2): F=0.13 → Reject H0? False
Cycle k=6 (period 2): F=2.56 → Reject H0? False
In [25]:
# Zoom into the last 300 months (25 years)
plt.figure(figsize=(14, 5))
plt.plot(t[-300:], y[-300:], label='Actual Data', linewidth=2)
plt.plot(t[-300:], y_hat[-300:], label='Fourier Fit', linestyle='--', linewidth=2, color='orange')
plt.title('Fourier Regression Fit (Zoomed View: Last 25 Years)')
plt.xlabel('Time (Months)')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [26]:
# Step 0: Recompute lambda_k
lambda_k = [2 * np.pi * k / T for k in range(1, K + 1)]

# Step 1: Forecasting horizon
forecast_horizon = 48  # Next 4 years
t_future = np.arange(n + 1, n + forecast_horizon + 1)

# Step 2: Forecast using Fourier model
y_forecast = np.full(forecast_horizon, alpha_0)
for k in range(K):
    y_forecast += alpha_hat[k] * np.cos(lambda_k[k] * t_future) + beta_hat[k] * np.sin(lambda_k[k] * t_future)

# Step 3: Combine for plotting
t_combined = np.concatenate([t[-300:], t_future])
y_combined = np.concatenate([y[-300:], y_forecast])
y_fit_combined = np.concatenate([y_hat[-300:], [np.nan]*forecast_horizon])  # fit only for real data

# Step 4: Plot zoomed view
plt.figure(figsize=(20, 8))
plt.plot(t[-300:], y[-300:], label='Observed Data (last 25 years)', linewidth=2)
plt.plot(t[-300:], y_hat[-300:], label='Fourier Fit', linestyle='--', color='orange', linewidth=2)
plt.plot(t_future, y_forecast, label='Fourier Forecast (next 4 years)', linestyle='--', color='green', linewidth=2)

plt.axvline(x=n, color='gray', linestyle=':', linewidth=2, label='Forecast Start')
plt.title('Fourier Forecast with Recent Observations (25 Years + 4-Year Forecast)', fontsize=16)
plt.xlabel('Time (Months)')
plt.ylabel('Temperature (°C)')
plt.legend(loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [27]:
residuals = y - y_hat
plt.figure(figsize=(12, 4))
plt.plot(residuals, label='Residuals')
plt.title('Residuals from Fourier-Only Model')
plt.xlabel('Time (Months)')
plt.ylabel('Residual')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [28]:
plt.figure(figsize=(10, 5))
plot_acf(residuals, lags=50, title='ACF of Residuals – Fourier-Only Model')
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 1000x500 with 0 Axes>
No description has been provided for this image
In [29]:
# Step 1: Prepare time variable (t already defined from 1 to n)

# Step 2: Create Fourier basis matrix
X_fourier = []
for i in range(n):
    row = [1, t[i]]  # Intercept and time (trend)
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        row.append(np.cos(lam * t[i]))
        row.append(np.sin(lam * t[i]))
    X_fourier.append(row)

# Convert to numpy matrix
X = np.array(X_fourier)  # Shape: (n, 2 + 2*K)

# Step 3: Solve for regression coefficients using normal equation
# theta_hat = (X^T X)^(-1) X^T y
XtX = X.T @ X
Xty = X.T @ y
theta_hat = np.linalg.inv(XtX) @ Xty

# Display theta_hat
print("Estimated regression coefficients:")
for i, coeff in enumerate(theta_hat):
    print(f"θ[{i}] = {coeff:.4f}")
Estimated regression coefficients:
θ[0] = 8.1417
θ[1] = 0.0009
θ[2] = -4.8511
θ[3] = -3.3407
θ[4] = -0.0752
θ[5] = -0.1613
θ[6] = -0.0160
θ[7] = 0.0524
θ[8] = -0.0348
θ[9] = 0.0613
θ[10] = 0.0003
θ[11] = 0.0101
θ[12] = 0.0206
θ[13] = -7178562806.5512
In [30]:
# Step 1: Reconstruct y_hat from theta_hat
y_reg_fit = np.zeros(n)

for i in range(n):
    # Start with intercept + trend
    pred = theta_hat[0] + theta_hat[1] * t[i]

    # Add Fourier terms manually
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        pred += theta_hat[2 * k] * np.cos(lam * t[i]) + theta_hat[2 * k + 1] * np.sin(lam * t[i])

    y_reg_fit[i] = pred

# Step 2: Plot result
plt.figure(figsize=(16, 8))
plt.plot(t[-300:], y[-300:], label='Actual Data (last 25 years)', linewidth=2)
plt.plot(t[-300:], y_reg_fit[-300:], label='Fourier + Trend Fit', linestyle='--', linewidth=2, color='red')
plt.title('Fourier + Trend Regression Fit (Last 25 Years)')
plt.xlabel('Time (Months)')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [31]:
# Plot the difference between the two models (with and without trend)
plt.figure(figsize=(12, 3))
plt.plot(t, y_reg_fit - y_hat, label='Difference: (Fourier+Trend) - Fourier')
plt.axhline(0, color='gray', linestyle='--')
plt.title('How Much the Trend Affects the Prediction')
plt.xlabel('Time (Months)')
plt.ylabel('Δ Prediction (°C)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [32]:
# Step 1: Define future time steps
forecast_horizon = 48  # next 48 months
t_future = np.arange(n + 1, n + forecast_horizon + 1)

# Step 2: Forecast manually using theta_hat
y_future = np.zeros(forecast_horizon)

for i in range(forecast_horizon):
    t_val = t_future[i]
    pred = theta_hat[0] + theta_hat[1] * t_val
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        pred += theta_hat[2 * k] * np.cos(lam * t_val) + theta_hat[2 * k + 1] * np.sin(lam * t_val)
    y_future[i] = pred

# Step 3: Plot full model with forecast
plt.figure(figsize=(16, 10))
plt.plot(t[-300:], y[-300:], label='Observed (Last 25 Years)', linewidth=2)
plt.plot(t[-300:], y_reg_fit[-300:], label='Fourier + Trend Fit', linestyle='--', color='green', linewidth=2)
plt.plot(t_future, y_future, label='Forecast (Next 4 Years)', linestyle='--', color='purple', linewidth=2)

plt.axvline(x=n, color='gray', linestyle=':', linewidth=2, label='Forecast Start')
plt.title('Fourier + Trend Forecast (Next 4 Years)', fontsize=16)
plt.xlabel('Time (Months)')
plt.ylabel('Temperature (°C)')
plt.legend(loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [33]:
# Step 1: Calculate residuals from the fitted model
residuals = y - y_reg_fit  # Actual - Fitted

# Step 2: Plot residuals
plt.figure(figsize=(14, 4))
plt.plot(t[-300:], residuals[-300:], color='gray', linewidth=1)
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals from Fourier + Trend Model (Last 25 Years)')
plt.xlabel('Time (Months)')
plt.ylabel('Residual')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [34]:
import scipy.stats as stats
# Q-Q Plot
plt.figure(figsize=(6, 6))
sm.qqplot(residuals, line='s', alpha=0.5)
plt.title("Normal Q-Q Plot of Residuals (Fourier + Trend)")
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 600x600 with 0 Axes>
No description has been provided for this image
In [35]:
residuals = y - y_reg_fit  # Actual - Fitted
# ACF of residuals
plt.figure(figsize=(10, 4))
plot_acf(residuals, lags=40, alpha=0.05)
plt.title("ACF of Residuals (Fourier + Trend Model)")
plt.xlabel("Lag")
plt.ylabel("Autocorrelation")
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 1000x400 with 0 Axes>
No description has been provided for this image
In [36]:
# Prepare data for Prophet by renaming columns: 'ds' for dates and 'y' for the target variable
df_prophet = df[['LandAvgTemp']].copy()
df_prophet.reset_index(inplace=True)
df_prophet.rename(columns={'Date': 'ds', 'LandAvgTemp': 'y'}, inplace=True)
df_prophet.head()
Out[36]:
ds y
0 1900-01-01 1.461
1 1900-02-01 3.098
2 1900-03-01 5.492
3 1900-04-01 8.223
4 1900-05-01 11.385
In [37]:
# Initialize and fit the Prophet model
# We disable daily and weekly seasonality since our data is monthly, but enable yearly seasonality.
model = Prophet(daily_seasonality=False, weekly_seasonality=False, yearly_seasonality=True)
model.fit(df_prophet)
DEBUG:cmdstanpy:input tempfile: /tmp/tmpntoh991y/vjbzn31h.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpntoh991y/lnwjflbg.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=34756', 'data', 'file=/tmp/tmpntoh991y/vjbzn31h.json', 'init=/tmp/tmpntoh991y/lnwjflbg.json', 'output', 'file=/tmp/tmpntoh991y/prophet_modelrbx2_evg/prophet_model-20250406120227.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
12:02:27 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
12:02:28 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
Out[37]:
<prophet.forecaster.Prophet at 0x789baea35690>
In [38]:
# Create a future dataframe for forecasting the next 48 months (monthly frequency)
future = model.make_future_dataframe(periods=48, freq='MS')
forecast = model.predict(future)
forecast.head()
Out[38]:
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 1900-01-01 8.334208 1.995847 2.792140 8.334208 8.334208 -5.944898 -5.944898 -5.944898 -5.944898 -5.944898 -5.944898 0.0 0.0 0.0 2.389310
1 1900-02-01 8.333123 2.540999 3.344873 8.333123 8.333123 -5.388569 -5.388569 -5.388569 -5.388569 -5.388569 -5.388569 0.0 0.0 0.0 2.944553
2 1900-03-01 8.332142 4.595095 5.406354 8.332142 8.332142 -3.339034 -3.339034 -3.339034 -3.339034 -3.339034 -3.339034 0.0 0.0 0.0 4.993108
3 1900-04-01 8.331057 7.722694 8.546308 8.331057 8.331057 -0.198296 -0.198296 -0.198296 -0.198296 -0.198296 -0.198296 0.0 0.0 0.0 8.132761
4 1900-05-01 8.330006 10.587851 11.434347 8.330006 8.330006 2.657274 2.657274 2.657274 2.657274 2.657274 2.657274 0.0 0.0 0.0 10.987280
In [39]:
# Prophet Forecast (last 24 months)
plt.figure(figsize=(10, 5))
plt.plot(forecast['ds'][-48:], forecast['yhat'][-48:], label='Forecast', color='red', alpha=0.7)
plt.fill_between(forecast['ds'][-48:],
                 forecast['yhat_lower'][-48:],
                 forecast['yhat_upper'][-48:],
                 color='red', alpha=0.2)
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.title('Prophet Forecast')
plt.legend()
plt.show()
No description has been provided for this image
In [40]:
# Prophet Full (entire fitted data + forecast)
plt.figure(figsize=(25, 5))
plt.plot(forecast['ds'], forecast['yhat'], label='Fitted data + Forecast', color='red', alpha=0.7)
plt.fill_between(forecast['ds'],
                 forecast['yhat_lower'],
                 forecast['yhat_upper'],
                 color='red', alpha=0.2)
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.title('Prophet Full')
plt.legend()
plt.show()
No description has been provided for this image
In [41]:
# Subset the forecast DataFrame to include dates from 2000 onward
forecast_subset = forecast[forecast['ds'] >= '1990-01-01']

# Plot the forecast (fitted data + forecast) from 2000 onward
plt.figure(figsize=(20, 6))
plt.plot(forecast_subset['ds'], forecast_subset['yhat'], label='Fitted Data + Forecast', color='red', alpha=0.7)
plt.fill_between(forecast_subset['ds'],
                 forecast_subset['yhat_lower'],
                 forecast_subset['yhat_upper'],
                 color='red', alpha=0.2)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Temperature (°C)', fontsize=14)
plt.title('Prophet Forecast last 25 years Onward', fontsize=16)
plt.legend(fontsize=12)
plt.show()
No description has been provided for this image
In [42]:
# Plot the forecast components: trend, yearly seasonality, etc.
fig2 = model.plot_components(forecast)
plt.show()
No description has been provided for this image
In [43]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# -------------------------
# Step 1: Train/Test Split
# -------------------------
train_end = '2010-12-01'
test_start = '2011-01-01'
test_end = '2015-12-01'

y_train = df['LandAvgTemp'].loc[:train_end]
y_test = df['LandAvgTemp'].loc[test_start:test_end]

# -------------------------
# Step 2: SARIMA Model
# -------------------------
sarima_model = SARIMAX(y_train,
                       order=(1, 1, 2),
                       seasonal_order=(1, 1, 2, 12),
                       enforce_stationarity=False,
                       enforce_invertibility=False)
sarima_result = sarima_model.fit(disp=False)
sarima_forecast = sarima_result.get_forecast(steps=60).predicted_mean.values

# -------------------------
# Step 3: Fourier + Trend
# -------------------------
t_train = np.arange(len(y_train)) + 1
t_test = np.arange(len(y_train)+1, len(y_train)+61)
y_fit = y_train.values
T = 12
K = 6  # harmonics

# Design matrix for training
X_train = np.column_stack([np.ones_like(t_train), t_train])
for k in range(1, K + 1):
    lam = 2 * np.pi * k / T
    X_train = np.column_stack((X_train,
                               np.cos(lam * t_train),
                               np.sin(lam * t_train)))

theta_hat = np.linalg.lstsq(X_train, y_fit, rcond=None)[0]

# Design matrix for testing
X_test = np.column_stack([np.ones_like(t_test), t_test])
for k in range(1, K + 1):
    lam = 2 * np.pi * k / T
    X_test = np.column_stack((X_test,
                              np.cos(lam * t_test),
                              np.sin(lam * t_test)))

fourier_forecast = X_test @ theta_hat

# -------------------------
# Step 4: Prophet Model
# -------------------------
prophet_df = y_train.reset_index()
prophet_df.columns = ['ds', 'y']
prophet_model = Prophet(yearly_seasonality=True, daily_seasonality=False, weekly_seasonality=False)
prophet_model.fit(prophet_df)

future = prophet_model.make_future_dataframe(periods=60, freq='MS')
prophet_forecast_df = prophet_model.predict(future)
prophet_pred = prophet_forecast_df.loc[future['ds'] >= test_start].set_index('ds').loc[test_start:test_end]['yhat'].values

# -------------------------
# Step 5: Metrics
# -------------------------
y_true = y_test.values
results = {}
for model_name, pred in zip(
    ['SARIMA', 'Fourier + Trend', 'Prophet'],
    [sarima_forecast, fourier_forecast, prophet_pred]
):
    mse = mean_squared_error(y_true, pred)
    mae = mean_absolute_error(y_true, pred)
    rmse = np.sqrt(mse)
    results[model_name] = {'MSE': mse, 'MAE': mae, 'RMSE': rmse}

results_df = pd.DataFrame(results).T.round(4)

# -------------------------
# Step 6: Display + Plot
# -------------------------
import IPython.display as disp
disp.display(results_df)

# Plot
plt.figure(figsize=(14, 6))
plt.plot(y_test.index, y_true, label='Actual', color='black', linewidth=2)
plt.plot(y_test.index, sarima_forecast, label='SARIMA')
plt.plot(y_test.index, fourier_forecast, label='Fourier + Trend')
plt.plot(y_test.index, prophet_pred, label='Prophet')
plt.title("Forecast Comparison (Test Period: 2011–2015)")
plt.xlabel("Date")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency MS will be used.
  self._init_dates(dates, freq)
/usr/local/lib/python3.11/dist-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  warnings.warn("Maximum Likelihood optimization failed to "
DEBUG:cmdstanpy:input tempfile: /tmp/tmpntoh991y/68n0ejy7.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpntoh991y/si3ct1v6.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=25809', 'data', 'file=/tmp/tmpntoh991y/68n0ejy7.json', 'init=/tmp/tmpntoh991y/si3ct1v6.json', 'output', 'file=/tmp/tmpntoh991y/prophet_modelcibgwqzj/prophet_model-20250406120255.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
12:02:55 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
12:02:56 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
MSE MAE RMSE
SARIMA 0.0873 0.2187 0.2955
Fourier + Trend 0.1687 0.3490 0.4107
Prophet 0.0914 0.2314 0.3023
No description has been provided for this image

part 3 :¶

In [82]:
# === Step 1: Load mixed dataset and extract year/month ===
df = pd.read_csv('mixed_dataset.csv', parse_dates=['dt'])

# Extract year and month
df['year'] = df['dt'].dt.year
df['month'] = df['dt'].dt.month

# Filter to 1979–2015
df = df[(df['year'] >= 1979) & (df['year'] <= 2015)]

# === Step 2: Rename columns to match old structure ===
df = df.rename(columns={'dt': 'Date', 'LandAverageTemperature': 'LandAvgTemp', 'CO2_Trend': 'trend'})

# === Step 3: Select and sort
merged_df = df[['Date', 'LandAvgTemp', 'trend']]
merged_df.sort_values(by='Date', inplace=True)

# Optional: Reset index
merged_df.reset_index(drop=True, inplace=True)

# Preview
print(merged_df.head())
print(merged_df.tail())
print(f"Total rows: {len(merged_df)}")
        Date  LandAvgTemp   trend
0 1979-01-01        2.679  335.92
1 1979-02-01        2.841  336.26
2 1979-03-01        5.474  336.51
3 1979-04-01        8.455  336.72
4 1979-05-01       11.199  336.71
          Date  LandAvgTemp   trend
439 2015-08-01       14.755  399.85
440 2015-09-01       12.999  400.14
441 2015-10-01       10.801  400.37
442 2015-11-01        7.433  400.68
443 2015-12-01        5.518  401.17
Total rows: 444
<ipython-input-82-344408de961d>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.sort_values(by='Date', inplace=True)
In [83]:
# Step 1: Extract target and features from merged_df
y = merged_df['LandAvgTemp'].values
n = len(y)
T = 12  # Monthly seasonality
K = 6   # Number of harmonics

# Step 2: Create time index
t = np.arange(1, n + 1)

# Step 3: Create Fourier + time + exogenous matrix
X = []

for i in range(n):
    row = [1, t[i]]  # Intercept + linear trend
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        row.append(np.cos(lam * t[i]))
        row.append(np.sin(lam * t[i]))
    row.append(merged_df['trend'].iloc[i])  # Add exogenous COâ‚‚ trend
    X.append(row)

X = np.array(X)

# Preview shape
print("X shape:", X.shape)
print("y shape:", y.shape)
X shape: (444, 15)
y shape: (444,)
In [84]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Step 1: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Step 2: Define model names and dictionary
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'SVR': SVR(kernel='linear'),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
}

# Step 3: Fit and evaluate models
model_results = {}
mse_results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_results[name] = y_pred
    mse_results[name] = mean_squared_error(y_test, y_pred)
mse_df = pd.DataFrame.from_dict(mse_results, orient='index', columns=['MSE']).sort_values(by='MSE')
print("Mean Squared Error for each model:")
print(mse_df)
Mean Squared Error for each model:
                   MSE
RandomForest  0.100681
XGBoost       0.140291
DecisionTree  0.173938
Linear        0.175305
Ridge         0.178538
SVR           0.245638
Lasso         4.339406
In [85]:
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Compute absolute time index for all months
t = np.arange(1, len(y) + 1)  # Full series: t = 1 to n

# Step 2: Extract t for the test set
t_test = t[-len(y_test):]  # This will be like 152 to 444

# Step 3: Plot using t_test as x-axis
def plot_preds(y_preds, y_true, title, x_vals):
    plt.figure(figsize=(18, 8))
    for model_name, y_pred in y_preds.items():
        plt.plot(x_vals, y_pred, label=model_name, alpha=0.5)
    plt.plot(x_vals, y_true, label='True', color='orange', linewidth=2)
    plt.xlabel('Month Index (starting from Jan 1979 = 1)')
    plt.ylabel('Temperature (°C)')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Step 4: Call the function
plot_preds(model_results, y_test, "Model Predictions vs True Temperatures", t_test)
No description has been provided for this image
In [48]:
# === Step 1: Create X_future for forecasting ===

forecast_horizon = 48  # Forecasting 4 years ahead
T = 12                 # Monthly seasonality
K = 6                  # Number of harmonics

# Time index for future months
t_last = len(y)  # Last time index from full data
t_future = np.arange(t_last + 1, t_last + forecast_horizon + 1)

# Extrapolate COâ‚‚ trend linearly
last_trend = merged_df['trend'].iloc[-1]
second_last_trend = merged_df['trend'].iloc[-2]
slope = last_trend - second_last_trend
trend_future = [last_trend + slope * (i + 1) for i in range(forecast_horizon)]

# Build X_future with same structure
X_future = []

for i in range(forecast_horizon):
    row = [1, t_future[i]]  # Intercept + trend
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        row.append(np.cos(lam * t_future[i]))
        row.append(np.sin(lam * t_future[i]))
    row.append(trend_future[i])  # COâ‚‚ trend
    X_future.append(row)

X_future = np.array(X_future)
print("X_future shape:", X_future.shape)
X_future shape: (48, 15)
In [49]:
# === Step 1: Forecast with best model ===
best_model = models['RandomForest']  # or 'XGBoost', etc.
y_future = best_model.predict(X_future)

# === Step 2: Combine full y (real) with forecast ===
y_combined = np.concatenate([y, y_future])

# === Step 3: Create time axis (months from 1 to total length) ===
t_combined = np.arange(1, len(y_combined) + 1)
forecast_start = len(y) + 1  # where forecast begins

# === Step 4: Plot full data with forecast ===
plt.figure(figsize=(14, 6))
plt.plot(t_combined[:len(y)], y, label='Observed', color='blue', linewidth=2)
plt.plot(t_combined[len(y):], y_future, label='Forecast (Next 4 Years)', color='purple', linestyle='--', linewidth=2)
plt.axvline(x=forecast_start, color='gray', linestyle=':', linewidth=2, label='Forecast Start')

plt.title('Full Time Series with Forecast (Observed + Predicted)', fontsize=16)
plt.xlabel('Time (Months from Jan 1979)')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [50]:
# Step 1: Get residuals from test set
y_pred_test = model_results['RandomForest']  # use 'Linear' or others if needed
residuals = y_test - y_pred_test

# Step 2: Plot residuals over time
plt.figure(figsize=(12, 4))
plt.plot(residuals, label='Residuals', color='darkred')
plt.axhline(0, color='gray', linestyle='--')
plt.title('Residuals Over Time')
plt.xlabel('Time (Test Set)')
plt.ylabel('Residual')
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [51]:
# Step 3: ACF plot of residuals
plt.figure(figsize=(8, 4))
plot_acf(residuals, lags=24)
plt.title("ACF of Residuals")
plt.tight_layout()
plt.show()
<Figure size 800x400 with 0 Axes>
No description has been provided for this image
In [52]:
# Step 4: Histogram + KDE
plt.figure(figsize=(8, 4))
sns.histplot(residuals, kde=True, color='navy', bins=20)
plt.axvline(0, color='gray', linestyle='--')
plt.title("Histogram of Residuals")
plt.xlabel("Residual")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [53]:
# Step 5: QQ Plot
plt.figure(figsize=(6, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("QQ Plot of Residuals")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [86]:
# Step 6: Mean of residuals
print("Mean of residuals:", np.mean(residuals))
Mean of residuals: 0.05312505617977758
In [55]:
# Step 1: Create X_no_exog (no COâ‚‚ trend)

X_no_exog = []

for i in range(len(y)):
    row = [1, t[i]]  # Intercept + time
    for k in range(1, K + 1):
        lam = 2 * np.pi * k / T
        row.append(np.cos(lam * t[i]))
        row.append(np.sin(lam * t[i]))
    X_no_exog.append(row)

X_no_exog = np.array(X_no_exog)

print("Shape of X_no_exog:", X_no_exog.shape)
Shape of X_no_exog: (444, 14)
In [56]:
# Step 2: Split again (same way)
Xn_train, Xn_test = X_no_exog[:len(X_train)], X_no_exog[len(X_train):]

# Step 3: Train same models
models_no_exog = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor()
}

mse_no_exog = {}

for name, model in models_no_exog.items():
    model.fit(Xn_train, y_train)
    y_pred = model.predict(Xn_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_no_exog[name] = mse
In [57]:
# Step 4: Compare with and without trend
mse_with_trend = {
    'Linear': mean_squared_error(y_test, model_results['Linear']),
    'RandomForest': mean_squared_error(y_test, model_results['RandomForest'])
}

# Combine into a table
comparison_df = pd.DataFrame({
    'With COâ‚‚ Trend': mse_with_trend,
    'Without COâ‚‚ Trend': mse_no_exog
})

print("MSE Comparison (With vs. Without Exogenous Variable):")
print(comparison_df)
MSE Comparison (With vs. Without Exogenous Variable):
              With COâ‚‚ Trend  Without COâ‚‚ Trend
Linear              0.175305           0.085482
RandomForest        0.100211           0.101441
In [58]:
#Function to compute BIC for Linear Regression
def compute_bic(y_true, y_pred, num_params):
    n = len(y_true)
    rss = np.sum((y_true - y_pred) ** 2)
    bic = n * np.log(rss / n) + num_params * np.log(n)
    return bic

# Number of parameters (intercept + coefficients)
k_with = X_train.shape[1] + 1
k_without = Xn_train.shape[1] + 1

# Refit models to make sure predictions are aligned
linear_with = LinearRegression()
linear_with.fit(X_train, y_train)
y_pred_with = linear_with.predict(X_test)

linear_no_exog = LinearRegression()
linear_no_exog.fit(Xn_train, y_train)
y_pred_without = linear_no_exog.predict(Xn_test)

# Compute BICs
bic_with = compute_bic(y_test, y_pred_with, k_with)
bic_without = compute_bic(y_test, y_pred_without, k_without)

print("BIC Comparison (Linear Regression)")
print(f"With COâ‚‚ trend     : {bic_with:.2f}")
print(f"Without COâ‚‚ trend  : {bic_without:.2f}")
BIC Comparison (Linear Regression)
With COâ‚‚ trend     : -83.15
Without COâ‚‚ trend  : -151.56
In [59]:
# Make sure you're using the merged dataset from 1979 to 2015
corr = merged_df[['LandAvgTemp', 'trend']].corr()
print("Correlation between Temperature and COâ‚‚ Trend:")
print(corr)
Correlation between Temperature and COâ‚‚ Trend:
             LandAvgTemp     trend
LandAvgTemp     1.000000  0.076056
trend           0.076056  1.000000
In [75]:
plt.figure(figsize=(7, 5))
sns.regplot(data=merged_df, x='trend', y='LandAvgTemp', line_kws={'color': 'red'})
plt.title("Scatter Plot: COâ‚‚ Trend vs. Land Average Temperature")
plt.xlabel("COâ‚‚ Trend")
plt.ylabel("Land Average Temperature (°C)")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [78]:
from sklearn.metrics import mean_absolute_error
# Ensure you're using predictions from consistent splits:
# Refit Linear WITH trend
linear_with = LinearRegression()
linear_with.fit(X_train, y_train)
y_pred_with = linear_with.predict(X_test)

# Refit Linear WITHOUT trend
linear_no_exog = LinearRegression()
linear_no_exog.fit(Xn_train, y_train)
y_pred_without = linear_no_exog.predict(Xn_test)

# Calculate MAE
mae_with = mean_absolute_error(y_test, y_pred_with)
mae_without = mean_absolute_error(y_test, y_pred_without)

# Print comparison
print("MAE Comparison (Linear Regression)")
print(f"With COâ‚‚ trend     : {mae_with:.4f}")
print(f"Without COâ‚‚ trend  : {mae_without:.4f}")
MAE Comparison (Linear Regression)
With COâ‚‚ trend     : 0.3384
Without COâ‚‚ trend  : 0.2198
In [63]:
# Step 1: Get existing predictions
y_pred_rf_with = model_results['RandomForest']                     # With COâ‚‚
y_pred_rf_without = models_no_exog['RandomForest'].predict(Xn_test)  # Without COâ‚‚

# Step 2: Compute MAE
mae_rf_with = mean_absolute_error(y_test, y_pred_rf_with)
mae_rf_without = mean_absolute_error(y_test, y_pred_rf_without)

# Step 3: Display results
print("MAE Comparison (Random Forest)")
print(f"With COâ‚‚ trend     : {mae_rf_with:.4f}")
print(f"Without COâ‚‚ trend  : {mae_rf_without:.4f}")
MAE Comparison (Random Forest)
With COâ‚‚ trend     : 0.2394
Without COâ‚‚ trend  : 0.2361
In [64]:
# === Refit Linear WITH COâ‚‚ trend ===
linear_with = LinearRegression()
linear_with.fit(X_train, y_train)
y_pred_with = linear_with.predict(X_test)
residuals_with = y_test - y_pred_with

# === Refit Linear WITHOUT COâ‚‚ trend ===
linear_no_exog = LinearRegression()
linear_no_exog.fit(Xn_train, y_train)
y_pred_without = linear_no_exog.predict(Xn_test)
residuals_without = y_test - y_pred_without

# === Plot ACF side-by-side ===
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

plot_acf(residuals_with, lags=24, zero=False, ax=axes[0])
axes[0].set_title("ACF of Residuals (Linear WITH COâ‚‚ trend)")

plot_acf(residuals_without, lags=24, zero=False, ax=axes[1])
axes[1].set_title("ACF of Residuals (Linear WITHOUT COâ‚‚ trend)")

plt.tight_layout()
plt.show()
No description has been provided for this image

part 4:¶

In [65]:
residual = aligned_data - aligned_trend - seasonality
# Plot the ACF of the residuals
plt.figure(figsize=(10, 4))
plot_acf(residual, lags=48)
plt.title("Autocorrelation Function (ACF) of Residuals")
plt.xlabel("Lag (Months)")
plt.ylabel("Autocorrelation")
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 1000x400 with 0 Axes>
No description has been provided for this image
In [66]:
# Step 1: Split residual into 12 monthly series and align by year

# Create a dictionary to hold months (1=Jan, ..., 12=Dec)
residual_months = {i: [] for i in range(1, 13)}

# Group residuals by month
for date, value in residual.items():
    month = date.month
    residual_months[month].append((date, value))

# Convert each month's data into a time series
residual_month_dfs = {}
for month in range(1, 13):
    data = residual_months[month]
    dates_m, values = zip(*data)
    series = pd.Series(data=values, index=pd.to_datetime(dates_m)).sort_index()
    residual_month_dfs[month] = series

# Identify years present in all months
years_by_month = [set(df.index.year) for df in residual_month_dfs.values()]
common_years = sorted(set.intersection(*years_by_month))

# Filter each series to only keep those years
for month in residual_month_dfs:
    df = residual_month_dfs[month]
    residual_month_dfs[month] = df[df.index.year.isin(common_years)]

    # Debug print
    print(f"Month {month}: {len(residual_month_dfs[month])} entries")
    print(f"   Years: {residual_month_dfs[month].index.year.min()} to {residual_month_dfs[month].index.year.max()}")

# ✅ Final confirmation
print(f"Residual series split into 12 monthly groups with {len(common_years)} common years.")
Month 1: 114 entries
   Years: 1901 to 2014
Month 2: 114 entries
   Years: 1901 to 2014
Month 3: 114 entries
   Years: 1901 to 2014
Month 4: 114 entries
   Years: 1901 to 2014
Month 5: 114 entries
   Years: 1901 to 2014
Month 6: 114 entries
   Years: 1901 to 2014
Month 7: 114 entries
   Years: 1901 to 2014
Month 8: 114 entries
   Years: 1901 to 2014
Month 9: 114 entries
   Years: 1901 to 2014
Month 10: 114 entries
   Years: 1901 to 2014
Month 11: 114 entries
   Years: 1901 to 2014
Month 12: 114 entries
   Years: 1901 to 2014
Residual series split into 12 monthly groups with 114 common years.
In [67]:
# Plot ACF for each month's residuals
fig, axes = plt.subplots(4, 3, figsize=(16, 12))  # 12 subplots: 4 rows × 3 columns
fig.suptitle("ACF of Monthly Residuals", fontsize=18)

for month, ax in zip(range(1, 13), axes.flatten()):
    series = residual_month_dfs[month]
    plot_acf(series, ax=ax, lags=20, zero=False, title=f"Month {month}", alpha=0.05)
    ax.set_xlabel("Lag")
    ax.set_ylabel("ACF")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
No description has been provided for this image
In [68]:
# Suggest CUSUM parameters (k, h) per month based on monthly residual std

cusum_params_by_month = {}

print("Suggested CUSUM parameters per month:\n")
print(f"{'Month':^7} | {'σ':^8} | {'Δ = 0.4σ':^10} | {'k = Δ/2':^9} | {'log(B)':^9}")
print("-" * 50)

for month in range(1, 13):
    series = residual_month_dfs[month]
    sigma = series.std()
    delta = 0.4 * sigma
    k = delta / 2
    h = 5 * sigma
    logB = np.log(h)  # <== use log threshold instead

    cusum_params_by_month[month] = {'sigma': sigma, 'k': k, 'logB': logB}

    print(f"{month:^7} | {sigma:.5f} | {delta:.5f}   | {k:.5f}  | {logB:.5f}")
Suggested CUSUM parameters per month:

 Month  |    σ     |  Δ = 0.4σ  |  k = Δ/2  |  log(B)  
--------------------------------------------------
   1    | 0.34408 | 0.13763   | 0.06882  | 0.54257
   2    | 0.38512 | 0.15405   | 0.07702  | 0.65523
   3    | 0.32179 | 0.12872   | 0.06436  | 0.47558
   4    | 0.22265 | 0.08906   | 0.04453  | 0.10727
   5    | 0.20215 | 0.08086   | 0.04043  | 0.01069
   6    | 0.17228 | 0.06891   | 0.03446  | -0.14921
   7    | 0.18880 | 0.07552   | 0.03776  | -0.05763
   8    | 0.19209 | 0.07683   | 0.03842  | -0.04038
   9    | 0.18717 | 0.07487   | 0.03743  | -0.06632
  10    | 0.24425 | 0.09770   | 0.04885  | 0.19986
  11    | 0.28058 | 0.11223   | 0.05612  | 0.33855
  12    | 0.33923 | 0.13569   | 0.06785  | 0.52837
In [69]:
# Step 2: Define the CUSUM function and score change-points per year
def apply_cusum(series, k, h):
    """Applies CUSUM to a time series and returns detected change-point dates."""
    values = series.values
    W = np.zeros(len(values))
    change_points = []

    for t in range(1, len(values)):
        W[t] = max(0, W[t - 1] + values[t] - k)
        if W[t] > h:
            change_points.append(series.index[t])
            W[t] = 0  # reset after detecting change

    return change_points

# Initialize score dictionary for each year
year_scores = {year: 0 for year in common_years}

#Apply CUSUM to each month's residuals using its own k and h
for month in range(1, 13):
    series = residual_month_dfs[month]
    k = cusum_params_by_month[month]['k']
    h = cusum_params_by_month[month]['logB']

    detected_cps = apply_cusum(series, k, h)
    for cp in detected_cps:
        if cp.year in year_scores:
            year_scores[cp.year] += 1
In [71]:
# Step 3: Apply a threshold to detect change years

# Define threshold: how many months in a year must have change-points to flag the year
threshold = 7  # you can adjust this to 6 or 9 for sensitivity

# Create a DataFrame to summarize results
results_df = pd.DataFrame({
    'Year': list(year_scores.keys()),
    'Change_Point_Months': list(year_scores.values())
})

# Add binary flag: 1 = global change detected, 0 = not detected
results_df['Detected_Change'] = results_df['Change_Point_Months'] >= threshold
highlighted_years = results_df[results_df['Detected_Change'] == True]

print(f"\nYears with ≥ {threshold} change-point months (Detected_Change = True):")
display(highlighted_years.sort_values(by='Change_Point_Months', ascending=False))
Years with ≥ 7 change-point months (Detected_Change = True):
Year Change_Point_Months Detected_Change
85 1986 8 True
77 1978 8 True
22 1923 7 True
25 1926 7 True
37 1938 7 True
31 1932 7 True
62 1963 7 True
52 1953 7 True
82 1983 7 True
92 1993 7 True
106 2007 7 True
109 2010 7 True
In [72]:
# Step 1: Filter to keep data from year 1901 onwards
filtered_data = aligned_data[aligned_data.index.year >= 1901]

# Compute average temperature per year
yearly_avg_temp = filtered_data.groupby(filtered_data.index.year).mean()

# Step 2: Extract years with ≥ 7 and = 6 change-point months
strong_change_years = results_df[results_df['Change_Point_Months'] >= 7]['Year'].tolist()
moderate_change_years = results_df[results_df['Change_Point_Months'] == 6]['Year'].tolist()

# Step 3: Plot the average temperature
plt.figure(figsize=(14, 6))
plt.plot(yearly_avg_temp.index, yearly_avg_temp.values, label='Average Temperature', color='black', linewidth=2)

#Strong change-point years (≥ 7)
for i, year in enumerate(strong_change_years):
    if year in yearly_avg_temp.index:
        plt.scatter(year, yearly_avg_temp.loc[year], color='red', s=80,
                    label='Strong Change Year (≥7)' if i == 0 else "")

#Moderate change-point years (= 6)
for i, year in enumerate(moderate_change_years):
    if year in yearly_avg_temp.index:
        plt.scatter(year, yearly_avg_temp.loc[year], color='orange', s=80,
                    label='Moderate Change Year (=6)' if i == 0 else "")

# Formatting
plt.title("Average Yearly Temperature with Detected Change Years", fontsize=16)
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image